rm(list=ls(all=TRUE))
#options( scipen=999)
#install.packages("foreign")
library(foreign)
#install.packages("graphics")
library(graphics)
#install.packages("plyr")
library(plyr)
#library(list)
#install.packages("readstata13")
library(readstata13)
#install.packages("arm")
library(arm)

#set working directory

mydata<-read.dta13("Corruption&ForeignInvestment.dta")

################################################################################
############### Figure 1: Attractiveness of Business Environments ##############

mydata1<- na.omit(mydata[,c("exprandom1","q146","oinvest")])  

cdata.all<-ddply(mydata,c("exprandom1"), summarise, 
     N=sum(!is.na(q146)),
     mean=mean(q146,na.rm=T),
     se=sd(q146,na.rm=T)/sqrt(N)
     )
                   
cdata.o<-ddply(mydata[which(mydata$oinvest==1),],c("exprandom1"), summarise, 
     N=sum(!is.na(q146)),
     mean=mean(q146,na.rm=T),
     se=sd(q146,na.rm=T)/sqrt(N)
     )    
cdata.d<-ddply(mydata[which(mydata$oinvest==0),],c("exprandom1"), summarise, 
     N=sum(!is.na(q146)),
     mean=mean(q146,na.rm=T),
     se=sd(q146,na.rm=T)/sqrt(N)
     )

################################################################################

pdf(file="Figure1.pdf", width=3.25,height=2.2)
par(mfrow=c(1,1),mar=c(1.2,2.4,.5,.8))
n<-c(1:4)
plot(n-0.1, cdata.d[,3],axes =F, type="n", xlim=c(0.7,4.3), 
     ylim=c(1.6,4.2), ylab="Attractiveness of Business Environments", xlab="", 
     mgp = c(1.6,.6,0),cex.lab=.6, cex.axis=.6,cex=.7)
grid(col = "gray80")
box()

cnames<-cbind("Clean & Efficient \nGovernment (V1)",
              "Clean & Inefficient\nGovernment (V2) ",
              "Predictable    \nCorruption (V3) ",
              "Arbitrary     \nCorruption (V4)")

axis(2, at = seq(1.5,4, by = .5),mgp = c(1.6,.56,0),cex.axis = 0.6)
text(seq(1.5,4.5,by=1), y=1.30, cnames, xpd=TRUE,pos=2,cex=.45)

points(n-0.2, cdata.all[,3], pch=16,col="black",cex=.8)
segments(n-0.2,cdata.all[,3]-1.96*cdata.all[,4],
         n-0.2,cdata.all[,3]+1.96*cdata.all[,4],lwd=1,co="black")

points(n, cdata.d[,3], pch=17,cex=.8)
segments(n,cdata.d[,3]-1.96*cdata.d[,4],
         n,cdata.d[,3]+1.96*cdata.d[,4],lwd=1,lty=2)

points(n+0.2,cdata.o[,3],pch=15,col="black",cex=.8)
segments(n+0.2,cdata.o[,3]-1.96*cdata.o[,4],
         n+0.2,cdata.o[,3]+1.96*cdata.o[,4],lwd=1,col="black",lty=6)

legend("topright",c("Pooled","Domestic-Oriented Firms","Overseas Investors"),
       pch=c(16,17,15),lty=c(1,2,6),cex=.5,
       box.lwd=1,pt.cex=0.6,box.col = NA, bty="n")
dev.off()

################################################################################
############################### T-Tests ########################################
# V1 vs. V2 (Full Sample)
t.test(mydata$q146[mydata$exprandom1==1],mydata$q146[mydata$exprandom1==2],
       alternative = c("two.sided"), var.equal = T)

# V3 vs V4 (Full Sample)
t.test(mydata$q146[mydata$exprandom1==3],mydata$q146[mydata$exprandom1==4],
       alternative = c("two.sided"),var.equal = T)

# V2 vs V3 & V4 (Full Sample)
t.test(mydata$q146[mydata$exprandom1==2],
       mydata$q146[mydata$exprandom1==3|mydata$exprandom1==4],
       alternative = c("two.sided"), ,var.equal = T)

# V1: Domestic-Oriented Firms vs Overseas Investors
t.test(mydata$q146[mydata$exprandom1==1&mydata$oinvest==0],
       mydata$q146[mydata$exprandom1==1&mydata$oinvest==1],
       alternative = c("two.sided"), var.equal = T)

# V2 vs V3 & V4 (Domestic-Oriented Firms)
t.test(mydata$q146[mydata$exprandom1==2&mydata$oinvest==0],
       mydata$q146[(mydata$exprandom1==3|mydata$exprandom1==4)&mydata$oinvest==0],
       alternative = c("two.sided"), var.equal = T)

# V2 vs V3 & V4 (Overseas Investors)
t.test(mydata$q146[mydata$exprandom1==2&mydata$oinvest==1],
       mydata$q146[(mydata$exprandom1==3|mydata$exprandom1==4)&mydata$oinvest==1],
       alternative = c("two.sided"), var.equal = T)

# V3 vs V4 (Overseas Investors)
t.test(mydata$q146[mydata$exprandom1==3&mydata$oinvest==1],
       mydata$q146[mydata$exprandom1==4&mydata$oinvest==1],
       alternative = c("two.sided"), var.equal = T)

# V3 vs V4 (Domestic-Oriented Firms)
t.test(mydata$q146[mydata$exprandom1==3&mydata$oinvest==0],
       mydata$q146[mydata$exprandom1==4&mydata$oinvest==0],
       alternative = c("two.sided"), var.equal = T)

################################################################################
############ Graph function: execute lines 119--142 
gplot<-function(cdata,colnum1,colnum2,ylow,yhigh,xlabp){
par(mfrow=c(1,1),mar=c(1.2,2.4,.5,.8))
n<-c(1:4)
plot(n, cdata[n*2-1,3],xaxt="n", type="n",xlim=c(0.8,4.2), ylim=c(ylow,yhigh), 
     ylab="Attractiveness of Business Environments", xlab="", 
     mgp = c(1.6,.56,0),cex.lab=.6, cex.axis=.6,cex=.7)
grid(col = "gray80")
box()

cnames<-cbind("Clean & Efficient \nGovernment (V1)",
              "Clean & Inefficient\nGovernment (V2) ",
              "Predictable    \nCorruption (V3) ",
              "Arbitrary     \nCorruption (V4)")
            
#axis(2, at = seq(2,4, by = .5),mgp = c(1.6,.56,0),cex.axis = 0.6)
text(seq(1.5,4.5,by=1), y=xlabp, cnames, xpd=TRUE,pos=2,cex=.45)
points(n-0.1, cdata[n*2-1,colnum1], pch=16,col="black",cex=.8)
segments(n-0.1,cdata[n*2-1,colnum1]-1.96*cdata[n*2-1,colnum2],
         n-0.1,cdata[n*2-1,colnum1]+1.96*cdata[n*2-1,colnum2],lwd=1)
         
points(n+0.1, cdata[n*2,colnum1], pch=17,col="black",cex=.8)
segments(n+0.1,cdata[n*2,colnum1]-1.96*cdata[n*2,colnum2],
         n+0.1,cdata[n*2,colnum1]+1.96*cdata[n*2,colnum2],lwd=1,lty=2)
}

################################################################################
############## Figure 2a: Firm Heterogeneity (Sales in 2013)  ##################           
mydata2<-subset(mydata,oinvest==1)
                
mydata2.1<-na.omit(mydata2[,c("exprandom1","q146","highsales")]) 

cdata.sales<-ddply(mydata2.1,c("exprandom1","highsales"), summarise, 
     N=sum(!is.na(q146)),
     mean=mean(q146,na.rm=T),
     se=sd(q146,na.rm=T)/sqrt(N)
     )

pdf(file="Figure2a.pdf", width=3.25,height=2.2)
gplot(cdata.sales,colnum1=4,colnum2=5,ylow=1.5,yhigh=4.5,xlabp=1.1)
legend("topright",c("Low Sales","High Sales"),
       pch=c(16,17),lty=c(1,2),cex=.5,
       box.lwd=1,pt.cex=0.6,box.col = NA, bty="n")
dev.off()

################################################################################
################ Figure 2b: Firm Heterogeneity (atfp) ##########################

mydata2.2<-na.omit(mydata2[,c("exprandom1","q146","atfp2")]) 
cdata.atfp<-ddply(mydata2.2,c("exprandom1","atfp2"), summarise, 
                  N=sum(!is.na(q146)),
                  mean=mean(q146,na.rm=T),
                  se=sd(q146,na.rm=T)/sqrt(N)
)

pdf(file="Figure2b.pdf", width=3.25,height=2.2)
gplot(cdata.atfp,colnum1=4,colnum2=5,ylow=1.5,yhigh=4.5,xlabp=1.1)
legend("topright",c("Low ATFP","High ATFP"),
       pch=c(16,17),lty=c(1,2),cex=.5,
       box.lwd=1,pt.cex=0.6,box.col = NA, bty="n")
dev.off()

################################################################################
############################### T Tests ########################################
###### Predictable (V3) vs arbitrary corruption (V4)
# V3 vs V4 (Low sales )
t.test(mydata2$q146[mydata2$exprandom1==3&mydata2$highsales==0],
       mydata2$q146[mydata2$exprandom1==4&mydata2$highsales==0],
       alternative = c("two.sided"), var.equal = T)

# V3 vs V4 (High sales)
t.test(mydata2$q146[mydata2$exprandom1==3&mydata2$highsales==1],
       mydata2$q146[mydata2$exprandom1==4&mydata2$highsales==1],
       alternative = c("two.sided"), var.equal = T)

# V3 vs V4 (High sales): one-tailed t test
t.test(mydata2$q146[mydata2$exprandom1==3&mydata2$highsales==1],
       mydata2$q146[mydata2$exprandom1==4&mydata2$highsales==1],
       alternative = c("greater"), var.equal = T)

# V3 vs V4 (High ATFP)
t.test(mydata2$q146[mydata2$exprandom1==3&mydata2$atfp2==1],
       mydata2$q146[mydata2$exprandom1==4&mydata2$atfp2==1],
       alternative = c("two.sided"), var.equal = T)

# V3 vs V4 (Low ATFP)
t.test(mydata2$q146[mydata2$exprandom1==3&mydata2$atfp2==0],
       mydata2$q146[mydata2$exprandom1==4&mydata2$atfp2==0],
       alternative = c("two.sided"), var.equal = T)

###### A clean and inefficient govt (V2) vs predictable corruption (V3)
# V2 vs V3 (Low sales)
t.test(mydata2$q146[mydata2$exprandom1==2&mydata2$highsales==0],
       mydata2$q146[mydata2$exprandom1==3&mydata2$highsales==0],
       alternative = c("two.sided"), var.equal = T)

# V2 vs V3 (Low ATFP)
t.test(mydata2$q146[mydata2$exprandom1==2&mydata2$atfp2==0],
       mydata2$q146[mydata2$exprandom1==3&mydata2$atfp2==0],
       alternative = c("two.sided"), var.equal = T)

# V2 vs V3 (High sales)
t.test(mydata2$q146[mydata2$exprandom1==2&mydata2$highsales==1],
       mydata2$q146[mydata2$exprandom1==3&mydata2$highsales==1],
       alternative = c("two.sided"), var.equal = T)

# V2 vs V3 (High ATFP)
t.test(mydata2$q146[mydata2$exprandom1==2&mydata2$atfp2==1],
       mydata2$q146[mydata2$exprandom1==3&mydata2$atfp2==1],
       alternative = c("two.sided"), var.equal = T)

# V2 vs V3 (High ATFP): one-tailed t test
t.test(mydata2$q146[mydata2$exprandom1==2&mydata2$atfp2==1],
       mydata2$q146[mydata2$exprandom1==3&mydata2$atfp2==1],
       alternative = c("greater"), var.equal = T)

################################################################################
################ Figure 3: Firm Heterogeneity (Fixed Assets) ###################

mydata2.3<-na.omit(mydata2[,c("exprandom1","q146","capint")]) 

cdata.capint<-ddply(mydata2.3,c("exprandom1","capint"), summarise, 
     N=sum(!is.na(q146)),
     mean=mean(q146,na.rm=T),
     se=sd(q146,na.rm=T)/sqrt(N)
     )

pdf(file="Figure3.pdf", width=3.25,height=2.2)
gplot(cdata.capint,colnum1=4,colnum2=5,ylow=1,yhigh=4.5,xlabp=.6)
legend("topright",c("Low Fixed Assets","High Fixed Assets"),
       pch=c(16,17),lty=c(1,2),cex=.5,
       box.lwd=1,pt.cex=0.6,box.col = NA, bty="n")
dev.off()

################################################################################
############################### T Tests ########################################
###### Predictable (V3) vs arbitrary corruption (V4)
# V3 vs V4 (High fixed assets)
t.test(mydata2$q146[mydata2$exprandom1==3&mydata2$capint==1],
       mydata2$q146[mydata2$exprandom1==4&mydata2$capint==1],
       alternative = c("two.sided"), var.equal = T)

# V3 vs V4 (High fixed assets): one-tailed t test
t.test(mydata2$q146[mydata2$exprandom1==3&mydata2$capint==1],
       mydata2$q146[mydata2$exprandom1==4&mydata2$capint==1],
       alternative = c("greater"), var.equal = T)

# V3 vs V4 (Low fixed assets)
t.test(mydata2$q146[mydata2$exprandom1==3&mydata2$capint==0],
       mydata2$q146[mydata2$exprandom1==4&mydata2$capint==0],
       alternative = c("two.sided"), var.equal = T)

################################################################################
########## Figure 4: Probabilities of Entry and Majority Ownership #############
################################################################################
##### function to calculate predicted prob.: execute lines 275--283

yhat<-function(x1,x2,x3,fit){
  set.seed(11011)
  coefsim<-sim(fit,1000)
  coefmat<-coefsim@coef
  
  mediansmat<-c(1,x1,x2,x3)
  p<- pnorm(coefmat%*%as.vector(mediansmat))
  c(mean(p),quantile(p,c(0.025,0.975)))
}

##### function to calculate marginal effects: execute lines 286--296
meffect<-function(x1a,x2a,x3a,x1b,x2b,x3b,fit){
  set.seed(11011)
  coefsim<-sim(fit,1000)
  coefmat<-coefsim@coef
  mediansmat1<-c(1,x1a,x2a,x3a)
  p1<- pnorm(coefmat%*%as.vector(mediansmat1))
  mediansmat2<-c(1,x1b,x2b,x3b)
  p2<- pnorm(coefmat%*%as.vector(mediansmat2))
  p<-p1-p2
  c(mean(p1),mean(p2),mean(p),quantile(p,c(0.005,0.025,0.05,0.95,0.975,0.995)))
}

################################################################################
### graph function:  execute lines 300--334
gplot2<-function(fit1,fit2,ylow,yhigh,xlabp){
  yhat1a<-yhat(1,0,0,fit1)
  yhat1b<-yhat(0,1,0,fit1)
  yhat1c<-yhat(0,0,1,fit1)
  yhat1d<-yhat(0,0,0,fit1)
  
  yhat2a<-yhat(1,0,0,fit2)
  yhat2b<-yhat(0,1,0,fit2)
  yhat2c<-yhat(0,0,1,fit2)
  yhat2d<-yhat(0,0,0,fit2)
  
  data1<-rbind(yhat1a,yhat1b,yhat1c,yhat1d)
  data2<-rbind(yhat2a,yhat2b,yhat2c,yhat2d)
  
  par(mfrow=c(1,1),mar=c(1.2,2.4,.5,.8))
  n<-c(1:4)
  plot(n, data1[,1],xaxt="n", type="n",xlim=c(0.8,4.2), ylim=c(ylow,yhigh), 
       ylab="Predicted Probabilities", xlab="", 
       mgp = c(1.6,.56,0),cex.lab=.6, cex.axis=.6,cex=.7)
  grid(col = "gray80")
  box()
  
  cnames<-cbind("Clean & Efficient \nGovernment (V1)",
                "Clean & Inefficient\nGovernment (V2) ",
                "Predictable    \nCorruption (V3) ",
                "Arbitrary     \nCorruption (V4)")
  
  #axis(2, at = seq(2,4, by = .5),mgp = c(1.6,.56,0),cex.axis = 0.6)
  text(seq(1.5,4.5,by=1), y=xlabp, cnames, xpd=TRUE,pos=2,cex=.45)
  points(n-0.15, data1[,1], pch=16,col="black",cex=.8)
  segments(n-0.15,data1[,2], n-0.15,data1[,3],lwd=1)
  
  points(n+0.15, data2[,1], pch=17,col="black",cex=.8)
  segments(n+0.15,data2[,2], n+0.15,data2[,3],lwd=1,lty=2)
}

### Fit a probit model of entry
fit1.1<-glm(entry~g1+g2+g3,family=binomial(link="probit"),
            data=mydata2,x=T)

### Fit a probit model of majority ownership
fit1.2<-glm(MajOwn~g1+g2+g3,family=binomial(link="probit"),
            data=mydata2,x=T)


pdf(file="Figure4.pdf", width=3.25,height=2.2)
gplot2(fit1.1,fit1.2,ylow=0.1,yhigh=1,xlabp=0)
legend("bottomright",c("Entry","Majority Ownership"),
       pch=c(16,17),lty=c(1,2),cex=.5,
       box.lwd=1,pt.cex=0.6,box.col = NA, bty="n")
dev.off()

############ predicted probability of market entry
# V1: a clean and efficient govt
yhat(1,0,0,fit1.1)
# V3: predictable corruption
yhat(0,0,1,fit1.1)
# V4: arbitrary corruption
yhat(0,0,0,fit1.1)

# marginal effect: V3 vs V4
meffect(0,0,1,0,0,0,fit1.1)

# marginal effect: V2 vs V3
meffect(0,1,0,0,0,1,fit1.1)

############ predicted probability of majority ownership
# V1: a clean and efficient govt
yhat(1,0,0,fit1.2)
# V3: predictable corruption
yhat(0,0,1,fit1.2)
# V2: a clean and inefficient govt
yhat(0,1,0,fit1.2)
# V4: arbitrary corruption
yhat(0,0,0,fit1.2)

# marginal effect: V2 vs V4
meffect(0,1,0,0,0,0,fit1.2)

################Figure 5: Heckman Selection Model #######################
### Read simulated coefs from the heckman selection model 
coef.heck<-read.dta13("sim_heck.dta")
coef.heck.e<-coef.heck[,c(5:10)]
coef.heck.m<-coef.heck[,c(1:4)]

yhat.e<-function(x1,x2,x3,coef.sim){
  xmat<-as.vector(c(x1,x2,x3,1,0,1))
  y.pred<-pnorm(as.matrix(coef.sim)%*%xmat)
  c(mean(y.pred),quantile(y.pred,c(0.025,0.975)))
}

yhat.m<-function(x1,x2,x3,coef.sim){
  xmat<-as.vector(c(x1,x2,x3,1))
  y.pred<-pnorm(as.matrix(coef.sim)%*%xmat)
  c(mean(y.pred),quantile(y.pred,c(0.025,0.975)))
}

yhat.e.a<-yhat.e(1,0,0,coef.heck.e)
yhat.e.b<-yhat.e(0,1,0,coef.heck.e)
yhat.e.c<-yhat.e(0,0,1,coef.heck.e)
yhat.e.d<-yhat.e(0,0,0,coef.heck.e)

yhat.m.a<-yhat.m(1,0,0,coef.heck.m)
yhat.m.b<-yhat.m(0,1,0,coef.heck.m)
yhat.m.c<-yhat.m(0,0,1,coef.heck.m)
yhat.m.d<-yhat.m(0,0,0,coef.heck.m)

data1<-rbind(yhat.e.a,yhat.e.b,yhat.e.c,yhat.e.d)
data2<-rbind(yhat.m.a,yhat.m.b,yhat.m.c,yhat.m.d)

pdf(file="Figure5.pdf", width=3.25,height=2.2)
par(mfrow=c(1,1),mar=c(1.2,2.4,.5,.8))
n<-c(1:4)
plot(n, data1[,1],xaxt="n", type="n",xlim=c(0.8,4.2), ylim=c(0.05,1), 
     ylab="Predicted Probabilities", xlab="", 
     mgp = c(1.6,.56,0),cex.lab=.6, cex.axis=.6,cex=.7)
grid(col = "gray80")
box()

cnames<-cbind("Clean & Efficient \nGovernment (V1)",
              "Clean & Inefficient\nGovernment (V2) ",
              "Predictable    \nCorruption (V3) ",
              "Arbitrary     \nCorruption (V4)")

axis(2, at = seq(0.2, 1, by = .2),mgp = c(1.6,.56,0),cex.axis = 0.6)
text(seq(1.5,4.5,by=1), y=-0.06, cnames, xpd=TRUE,pos=2,cex=.45)

points(n-0.15, data1[,1], pch=16,col="black",cex=.8)
segments(n-0.15,data1[,2],n-0.15,data1[,3],lwd=1,co="black")

points(n+.15, data2[,1], pch=17,col="black",cex=.8)
segments(n+.15,data2[,2],n+.15,data2[,3],lwd=1,co="black",lty=2)

legend("bottomleft",c("Entry","Majority Ownership"),
       pch=c(16,17),lty=c(1,2,6),cex=.5,
       box.lwd=1,pt.cex=0.6,box.col = NA, bty="n")
dev.off()

#################################################################################
##################Figure 6: Replication Experiment
rm(list=ls(all=TRUE))
mydata<-read.dta13("ReplicationExperiment.dta")

mydata1<- na.omit(mydata[,c("ExpRandom1","Q146","oinvest")])  

cdata<-ddply(mydata1,c("ExpRandom1"), summarise, 
                 N=sum(!is.na(Q146)),
                 mean=mean(Q146,na.rm=T),
                 se=sd(Q146,na.rm=T)/sqrt(N)
)


pdf(file="Figure6.pdf", width=3.25,height=2.2)
par(mfrow=c(1,1),mar=c(1.2,2.4,.5,.8))
n<-c(1:4)
plot(n-0.1, cdata[,3],axes =F, type="n", xlim=c(0.7,4.3), 
     ylim=c(2,4.3), ylab="Attractiveness of Business Environments", xlab="", 
     mgp = c(1.6,.6,0),cex.lab=.6, cex.axis=.6,cex=.7)
grid(col = "gray80")
box()

cnames<-cbind("Clean & Efficient \nGovernment (V1)",
              "Clean & Inefficient\nGovernment (V2) ",
              "Predictable    \nCorruption (V3) ",
              "Arbitrary     \nCorruption (V4)")

axis(2, at = seq(2,4, by = .5),mgp = c(1.6,.56,0),cex.axis = 0.6)
text(seq(1.5,4.5,by=1), y=1.70, cnames, xpd=TRUE,pos=2,cex=.45)


points(n, cdata[,3], pch=16,cex=.8)
segments(n,cdata[,3]-1.96*cdata[,4],
         n,cdata[,3]+1.96*cdata[,4],lwd=1,lty=1)

dev.off()

################## T Test #######################################################
# V2 vs V3
t.test(mydata$Q146[mydata$ExpRandom1==2],mydata$Q146[mydata$ExpRandom1==3],
       alternative = c("greater"), var.equal = T)
